In [28]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sys
import plotly.express as px
import os
import seaborn as sns
import json
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.offline as pyo
pyo.init_notebook_mode()
sys.tracebacklimit = 0 # turn off the error tracebacks
In [29]:
# Problem 1 - Get user agent and make headers dictionary
r = requests.get('https://httpbin.org/user-agent')
useragent = json.loads(r.text)['user-agent']
headers = {'User-Agent': 'data science passion project (vrd9sd@virginia.edu) (Language=Python 3.8.2; platform=Mac OSX 13.0.1'}
# Url to get HTML from
url = 'https://waterdata.usgs.gov/va/nwis/current/?type=gw'
r = requests.get(url, headers=headers)
# Parsing HTML code
mysoup = BeautifulSoup(r.text, 'html.parser')
In [30]:
# make empty list of counties
counties=[]
# populate county list with one instance for each site number under that county
for td in mysoup.find_all('td'):
if 'colspan' in td.attrs:
county_name = td.strong.text
current_county = county_name
elif td.find('a') and td.find('a').get('href', '').startswith('/va/nwis/uv'):
counties.append(current_county[1:])
In [31]:
station_number = [x.string for x in mysoup.find_all(lambda tag: tag.has_attr('href') and tag['href'].startswith('/va/nwis/uv'))]
In [32]:
date_and_site = [x.string for x in mysoup.find_all('td', attrs = {'nowrap':'nowrap'})]
dates = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==1]
site_name = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==0]
water_depth_strings = [x.find_all('td')[3].string for x in mysoup.find_all('tr', attrs = {'align':'right'})]
water_depths = [item[:-1] if item != None else np.nan for item in water_depth_strings]
Making County names consistent with Fips Database¶
In [33]:
counties = [x.replace('Of', 'of') for x in counties] # uncapitalize 'Of' in places
counties = [x.replace('And', 'and') for x in counties] # uncapitalize 'And' for places
counties = [x[:-4] + 'city' if x.endswith('City') else x for x in counties] # uncapitalize 'City' in places
In [34]:
groundwater_data = pd.DataFrame({
'Jurisdiction': counties,
'station number': station_number,
'dates': dates,
'site_name': site_name,
'water table depth': water_depths})
Outputting to CSV File¶
In [35]:
file_path = 'water_table_depths.csv'
if not os.path.exists(file_path):
groundwater_data.to_csv(file_path, index=False)
else:
groundwater_data.to_csv(file_path, mode='a', header=False, index=False)
Make the date current eastern time datetime object¶
In [36]:
# from datetime import datetime
# from zoneinfo import ZoneInfo
# edt_timezone = ZoneInfo('US/Eastern')
# datetime_objects = []
# for date in dates:
# date_string = date.replace(' EDT', '') # strip edt since we set
# date_object = datetime.strptime(date_string, "%m/%d %H:%M") # parse into datetime object
# date_object = date_object.replace(year=datetime.now().year) # Add current year
# date_object = date_object.astimezone(edt_timezone) # localize to edt timezone
# datetime_objects.append(date_object) # append to list
# print(datetime_objects[-1])
Merging Fips-County data with groundwater_data¶
In [37]:
data = pd.read_csv('vacounties.csv') # read in vacounties data
va_fips = data.iloc[:, :2] #isolate county and FIPS columns
In [38]:
# Come back to this to merge correctly (counties named city turns into error)
merged_data = pd.merge(va_fips, groundwater_data, on='Jurisdiction', how='right', indicator=True) # merge groundwater data with counties
In [39]:
merged_data.query("_merge!='both'") # Identify Counties that dont have match in right df Should be empty
Out[39]:
| FIPS | Jurisdiction | station number | dates | site_name | water table depth | _merge |
|---|
In [40]:
merged_data['water table depth'] = merged_data['water table depth'].astype('float')
In [41]:
grouped = merged_data.groupby('Jurisdiction').agg(
mean_county_depth=('water table depth', 'mean'),
median_county_depth = ('water table depth', 'median'),
num_county_stations = ('water table depth', 'size'),
min_county_depth = ('water table depth', 'min'),
max_county_depth = ('water table depth', 'max'),
q25_county_depth = ('water table depth', lambda x: np.percentile(x, 75)),
q75_county_depth = ('water table depth', lambda x: np.percentile(x, 75)),
).reset_index()
grouped.head()
Out[41]:
| Jurisdiction | mean_county_depth | median_county_depth | num_county_stations | min_county_depth | max_county_depth | q25_county_depth | q75_county_depth | |
|---|---|---|---|---|---|---|---|---|
| 0 | Accomack County | 47.454167 | 41.540 | 12 | 9.30 | 101.23 | 75.0800 | 75.0800 |
| 1 | Appomattox County | -0.580000 | -0.580 | 1 | -0.58 | -0.58 | -0.5800 | -0.5800 |
| 2 | Augusta County | 15.600000 | 15.600 | 1 | 15.60 | 15.60 | 15.6000 | 15.6000 |
| 3 | Bath County | 5.860000 | 5.860 | 1 | 5.86 | 5.86 | 5.8600 | 5.8600 |
| 4 | Bedford County | 16.932500 | 12.265 | 4 | 6.10 | 37.10 | 20.0675 | 20.0675 |
In [42]:
merged_with_stats = pd.merge(merged_data, grouped, on='Jurisdiction', how='left')
merged_with_stats.head()
Out[42]:
| FIPS | Jurisdiction | station number | dates | site_name | water table depth | _merge | mean_county_depth | median_county_depth | num_county_stations | min_county_depth | max_county_depth | q25_county_depth | q75_county_depth | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51001 | Accomack County | 374425075400001 | 10/02 17:00 EDT | 65K 27 SOW 114A | 50.74 | both | 47.454167 | 41.54 | 12 | 9.3 | 101.23 | 75.08 | 75.08 |
| 1 | 51001 | Accomack County | 374425075400002 | 10/02 17:00 EDT | 65K 28 SOW 114B | 79.46 | both | 47.454167 | 41.54 | 12 | 9.3 | 101.23 | 75.08 | 75.08 |
| 2 | 51001 | Accomack County | 374425075400003 | 10/02 17:00 EDT | 65K 29 SOW 114C | 101.23 | both | 47.454167 | 41.54 | 12 | 9.3 | 101.23 | 75.08 | 75.08 |
| 3 | 51001 | Accomack County | 375315075332101 | 10/02 16:30 EDT | 66M 57 | 76.16 | both | 47.454167 | 41.54 | 12 | 9.3 | 101.23 | 75.08 | 75.08 |
| 4 | 51001 | Accomack County | 375315075332102 | 10/02 16:30 EDT | 66M 58 | 69.47 | both | 47.454167 | 41.54 | 12 | 9.3 | 101.23 | 75.08 | 75.08 |
Look at distribution of groundwater depths¶
In [43]:
sns.histplot(merged_with_stats['water table depth'], bins=9)
plt.xlabel('Water Table Depth (ft)')
plt.ylabel('Frequency')
plt.title('VA Water Table Depths at USGS monitoring locations')
plt.show()
Making Chloropleth of water table depths by counties¶
In [44]:
# Get geojson for county fips
import json
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
r = requests.get(url)
geojson_counties = json.loads(r.text)
In [45]:
# Showing Virginia choropleth for FIPS that are in merged groundwater_data
fig = px.choropleth(merged_with_stats,
geojson=geojson_counties,
color='num_county_stations',
scope='usa',
locations='FIPS',
hover_name = 'Jurisdiction',
color_continuous_scale = px.colors.sequential.Greys, # [::-1], # reverses the color scale
hover_data = ['num_county_stations', 'mean_county_depth'],
title = 'VA Water Table Depth monitoring stations by County Vizualization'
)
fig.update_geos(fitbounds='locations')
fig.show()
In [46]:
# remove counties with few monitoring stations
merged_with_stats = merged_with_stats[merged_with_stats['num_county_stations'] > 2]
In [47]:
fig2 = px.choropleth(merged_with_stats,
geojson=geojson_counties,
color='mean_county_depth',
scope='usa',
locations='FIPS',
hover_name = 'Jurisdiction',
color_continuous_scale = px.colors.sequential.PuBu, #[::-1], # reverses the color scale
hover_data = [ 'mean_county_depth', 'min_county_depth', 'max_county_depth','num_county_stations'],
title = 'Mean water table depths by VA County (> 2 stations)'
)
fig2.update_geos(fitbounds='locations')
fig2.show()
In [48]:
import plotly.io as pio
pio.write_html(fig, 'choropleth_mean_depth.html')
pio.write_html(fig2, 'choropleth_num_stations.html')
In [49]:
from IPython.display import IFrame
# Embed the first figure
IFrame('choropleth_mean_depth.html', width=800, height=600)
# Embed the second figure
IFrame('choropleth_num_stations.html', width=800, height=600)
Out[49]: